---
title: "R Notebook"
output: html_notebook
---

# Setup
```{r}

# Install (if needed) and load all packages used in this notebook.
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  readxl,
  readstata13,
  tidyverse,
  pdftools,
  quanteda,
  xtable
)


# Globals
# A large scipen penalty makes R prefer fixed over scientific notation.
options(scipen = 999)

# Load data

# Main data sets (Stata files); the figures below use the restricted sample
# unless stated otherwise.
df_restricted <- read.dta13("replication_data/dataset_restricted.dta", nonint.factors = TRUE)
df_full <- read.dta13("replication_data/dataset_final.dta", nonint.factors = TRUE)

# Chapel Hill Expert Survey
ches <- read.csv("replication_data/CHES_means_1999-2014.csv", stringsAsFactors = FALSE)
ches17 <- read.csv("replication_data/CHES_means_2017.csv", stringsAsFactors = FALSE)

# Party manifestos
# NOTE(review): the Labour URL points to a 2019 document while the Conservative
# file name carries no year -- confirm both are the intended editions.
url_cons <- "https://s3.eu-west-2.amazonaws.com/conservative-party-manifestos/Forward+Together+-+Our+Plan+for+a+Stronger+Britain+and+a+More+Prosperous....pdf"
url_labour <- "https://labour.org.uk/wp-content/uploads/2019/11/Real-Change-Labour-Manifesto-2019.pdf"

# Only hit the network when no local copy exists, so the notebook can be
# re-run offline once the PDFs have been fetched.
if (!file.exists("ForwardTogether.pdf")) {
  download.file(url_cons, "ForwardTogether.pdf", mode = "wb")
}
cons <- pdf_text("ForwardTogether.pdf")

if (!file.exists("RealChange.pdf")) {
  download.file(url_labour, "RealChange.pdf", mode = "wb")
}
labour <- pdf_text("RealChange.pdf")
```



# FIGURE 1: Digitalization: ICT capital stock per employee, by industry
```{r}
# Figure 1: ICT per worker over time on a log10 y-axis, one panel per
# `euklems_num` industry code.
fig_ict <- df_restricted %>%
  ggplot(aes(x = year, y = ICT)) +
  # na.rm is a layer argument, not an aesthetic, so it belongs here rather
  # than inside aes().
  geom_line(na.rm = TRUE) +
  scale_y_log10() +
  xlab("Year") +
  ylab("ICT per worker") +
  facet_wrap(~ euklems_num, ncol = 4) +
  theme(strip.text.x = element_text(size = 6, colour = "black"),
        axis.text.x = element_text(color = "black", size = 6))

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_paper/fg1.eps", plot = fig_ict,
       width = 14, height = 20, units = "cm")

# Display the figure in the notebook.
fig_ict
```



# FIGURE SI0.1: Average hourly net wage by education
```{r}
# Figure SI0.1: mean hourly wage per year, one line per education level.
fig_wage_by_edu <- df_restricted %>%
  filter(!is.na(hourly_wage)) %>%
  group_by(year, edu) %>%
  summarise(avg_wage = mean(hourly_wage, na.rm = TRUE)) %>%
  ggplot(aes(x = year,
             y = avg_wage,
             group = edu)) +
  geom_line(size = 2,
            aes(color = edu)) + #, linetype =edu
  #ggtitle("Wage development over time by education level") +
  scale_color_discrete("Education level",
                       guide = guide_legend(reverse = TRUE)) +
  xlab("Year") +
  ylab("Average Hourly Wage in GBP")

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_avg_income_by_edu.eps",
       plot = fig_wage_by_edu, width = 14, height = 10, units = "cm")

# Display the figure in the notebook.
fig_wage_by_edu
```

# FIGURE SI0.2: Share unemployed by education
This figure uses the full sample, because many unemployed respondents are not assigned to an industry and are therefore excluded from the restricted sample.
```{r}
# Figure SI0.2: share of unemployed respondents per year and education level,
# computed on the full sample (ages 18-64, 1997 onwards, known education).
fig_unemployed_share <- df_full %>%
  # Blank out `incomem` for industry codes 39, 40 and "NA"; the column is not
  # used further in this figure.
  mutate(incomem = ifelse(euklems_num %in% c(39, 40, "NA"), NA, incomem)) %>%
  filter(year >= 1997,
         age > 17,
         age < 65,
         !is.na(edu),
         edu != "NA") %>%
  group_by(year, edu) %>%
  summarise(mean_unemployed = mean(unemployed, na.rm = TRUE)) %>%
  ggplot(aes(x = year,
             y = mean_unemployed,
             group = edu)) +
  geom_line(size = 2,
            aes(color = edu)) + #, linetype =edu
  scale_color_discrete("Education level",
                       guide = guide_legend(reverse = FALSE)) +
  xlab("Year") +
  ylab("Share of unemployed") +
  scale_y_continuous(labels = scales::percent)

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_this_period_unemployed_all.eps",
       plot = fig_unemployed_share, width = 14, height = 10, units = "cm")

# Display the figure in the notebook.
fig_unemployed_share
```

# FIGURE SI0.3: Probability to become unemployed in the next period by education
```{r}
# Figure SI0.3: mean of `F_unemployed / 100` per year and education level
# among respondents who are currently not unemployed (years up to 2016,
# industry codes 39, 40 and missing excluded).
fig_next_unemployed <- df_restricted %>%
  filter(!(euklems_num %in% c(39, 40, "NA", NA)),
         unemployed == 0,
         year <= 2016) %>%
  group_by(year, edu) %>%
  summarise(mean_unemployed = mean(F_unemployed / 100, na.rm = TRUE)) %>%
  ggplot(aes(x = year,
             y = mean_unemployed,
             group = edu)) +
  geom_line(size = 2,
            aes(color = edu)) + #, linetype =edu
  scale_color_discrete("Education level",
                       guide = guide_legend(reverse = FALSE)) +
  xlab("Year") +
  ylab("Probability to become unemployed") +
  scale_y_continuous(labels = scales::percent)

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_next_period_unemployed_sample.eps",
       plot = fig_next_unemployed, width = 14, height = 10, units = "cm")

# Display the figure in the notebook.
fig_next_unemployed
```

# FIGURE SI0.4: Reported voter turnout by education
```{r}
# Figure SI0.4: mean of `voted` (divided by 100 to give a proportion) per
# year, one panel per education level.
fig_turnout <- df_restricted %>%
  group_by(year, edu) %>%
  summarise(voted = mean(voted, na.rm = TRUE) / 100) %>%
  ggplot(aes(x = year, y = voted)) +
  geom_line(size = 2) +
  geom_point() +
  xlab("Year") +
  scale_y_continuous(labels = scales::percent) +
  ylab("Turnout") +
  facet_wrap(~ edu)

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_turnout_by_edu.eps",
       plot = fig_turnout, width = 14, height = 7, units = "cm")

# Display the figure in the notebook.
fig_turnout
```

# FIGURE SI0.5: Support for political parties by education
```{r}
# Figure SI0.5: mean support for each party per year (up to 2017), one panel
# per education level. The party indicators are divided by 100 to give
# proportions.
fig_vote_share <- df_restricted %>%
  filter(year <= 2017) %>%
  group_by(year, edu) %>%
  summarise(Conservatives = mean(cons, na.rm = TRUE) / 100,
            Labour = mean(labour, na.rm = TRUE) / 100,
            UKIP = mean(ukip, na.rm = TRUE) / 100,
            LibDem = mean(libdem, na.rm = TRUE) / 100) %>%
  # Long format: one row per year x education x party.
  pivot_longer(cols = c(Conservatives, Labour, UKIP, LibDem),
               names_to = "Party",
               values_to = "Vote_share") %>%
  # Drops zero and missing shares so they are not drawn.
  filter(Vote_share > 0) %>%
  ggplot(aes(x = year, y = Vote_share, color = factor(Party))) +
  geom_line(size = 2) +
  #ggtitle("Respondents' vote share by education level") +
  # Named values tie each colour to its party instead of relying on the
  # alphabetical order of the factor levels.
  scale_color_manual("", values = c(Conservatives = "blue",
                                    Labour = "red",
                                    LibDem = "gold",
                                    UKIP = "purple")) +
  theme(legend.position = "bottom") +
  xlab("Year") +
  scale_y_continuous(labels = scales::percent) +
  ylab("Vote share") +
  facet_wrap(~ edu)

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_vote_share_by_edu.eps",
       plot = fig_vote_share, width = 14, height = 9, units = "cm")

# Display the figure in the notebook.
fig_vote_share
```

# FIGURE SI1.8: Party Positions over time
```{r}
# Columns holding the left-right scores that must be numeric.
score_cols <- c("lrgen", "lrecon")

# UK rows of the 1999-2014 CHES file, restricted to the columns used below.
uk <- ches %>%
  filter(country == "uk") %>%
  dplyr::select(year, party, lrgen, lrecon, vote)
uk[score_cols] <- lapply(uk[score_cols], function(score) as.numeric(as.character(score)))

# Same extraction for the 2017 CHES file.
uk17 <- ches17 %>%
  filter(country == "uk") %>%
  dplyr::select(year, party, lrgen, lrecon, vote)
uk17[score_cols] <- lapply(uk17[score_cols], function(score) as.numeric(as.character(score)))

# Stack both waves and harmonise the party labels across spellings.
uk_all <- rbind(uk, uk17)
uk_all$party[uk_all$party %in% c("Cons", "CONS")] <- "Conservative"
uk_all$party[uk_all$party %in% c("Lab", "LAB")] <- "Labour"

# Figure SI1.8: economic left-right position of Conservatives and Labour per
# survey year, expressed as the deviation from that year's vote-weighted mean
# `lrecon` across all UK parties in the data.
fig_ches_lrecon <- uk_all %>%
  group_by(year) %>%
  mutate(wmeanlrecon = weighted.mean(lrecon, vote, na.rm = TRUE),
         diff = lrecon - wmeanlrecon) %>%
  ungroup() %>%
  # The yearly mean is computed before this filter, so it covers every party.
  filter(party == "Conservative" | party == "Labour") %>%
  ggplot(aes(x = diff, y = year, group = party, shape = party)) +
  geom_point(size = 3) +
  geom_vline(aes(xintercept = 0), linetype = "dotted") +
  xlab("Demeaned Left-Right Stance on Economic Issues") + ylab("") +
  scale_y_continuous(breaks = c(1999, 2002, 2006, 2010, 2014, 2017)) +
  theme_bw() +
  theme(text = element_text(size = 14),
        legend.title = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_line(colour = "grey50",
                                          linetype = "solid"))

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_ches_lrecon_weighted.eps",
       plot = fig_ches_lrecon, width = 14, height = 9, units = "cm")

# Display the figure in the notebook.
fig_ches_lrecon
```

# FIGURE SI1.9: Most frequent terms in manifesto
```{r}
# Builds the per-party term-frequency table (`freq3`) that feeds the
# word-frequency figure. `cons` and `labour` are the manifesto texts read with
# pdf_text() in the setup chunk.
# NOTE(review): texts(), dfm(remove = ...) and textstat_frequency() as used
# here appear to target an older quanteda API -- confirm against the installed
# quanteda version before re-running.

# build a new corpus from the texts
corp_cons <- corpus(cons)  
corp_labour <- corpus(labour)

# combine pages into one corpus
# (every document gets the same group id, so each manifesto ends up as a
# single document)
corp_cons <- corpus(texts(corp_cons, groups = rep(1, ndoc(corp_cons))))
docvars(corp_cons, "Party") <- "Conservative"
names(corp_cons) <- "Corp_Cons"

corp_labour <- corpus(texts(corp_labour, groups = rep(1, ndoc(corp_labour))))
docvars(corp_labour, "Party") <- "Labour"
names(corp_labour) <- "Corp_Labour"

# joint corpus with one document per party
corp <- corpus(c(corp_cons, corp_labour))

# tokenise without punctuation, then build the document-feature matrix
# without English stopwords and a few additional filler words
toks <- tokens(corp, remove_punct = TRUE) 
dfmat <- dfm(toks, remove = c(stopwords("english"), "can", "make", "need", "also", "including", "new"))
 
# 30 most frequent features per party
freq <- textstat_frequency(dfmat, n = 30, groups = "Party")
freq <- as.data.frame(freq)

# highlight the terms of interest in red, everything else in grey
freq$color <- ifelse(freq$feature=="digital" | freq$feature=="technology", "red", "grey")

# hand-entered extra rows for the digital/technology terms, placed after the
# top 30 via ranks 32/33
# NOTE(review): these frequencies are hard-coded; presumably they come from a
# separate wildcard count of "technolog*" / "digital" -- confirm and ideally
# compute them from `dfmat`.
freq2 <- data.frame(feature=c("technolog*", "digital", "technolog*"), frequency=c(35, 15, 13), rank=c(32, 32, 33), docfreq=c(1,1,1), group=c("Conservative", "Labour", "Labour"), color=c("red", "red", "red"))

freq3 <- rbind(freq, freq2)

# sort by party and rank; the plot passes freq3$color as a plain vector, so
# the row order presumably has to line up with the drawn bars -- TODO confirm
freq3 <- freq3 %>% arrange(group, rank)

# Figure SI1.9: bar chart of term frequencies by rank, one panel per party,
# with each bar labelled by its term and coloured by the precomputed
# `freq3$color`.
fig_word_freq <- ggplot(freq3, aes(y = frequency, x = rank)) +
  geom_bar(position = 'dodge', stat = "identity", fill = freq3$color) +
  geom_text(aes(label = feature), position = position_dodge(width = 0.9), angle = 90) +
  facet_wrap(~group) +
  ylab("Most Frequent Terms in Manifesto") + xlab("") +
  theme(axis.text.x = element_blank())

# ggsave() is a separate statement with an explicit `plot`; adding it to the
# plot with `+` is rejected by recent ggplot2 releases.
ggsave(filename = "results_online_appendix/SI_word_frequency.eps",
       plot = fig_word_freq, width = 14, height = 9, units = "cm")

# Display the figure in the notebook.
fig_word_freq
```

# TABLE SI1.16: Keyword in Context ("Digital")
```{r}
# Table SI1.16: most frequent words surrounding "digital" in the Conservative
# manifesto, written out as a LaTeX table.

# Keyword-in-context hits for "digital" as a data frame, keeping columns 4-6.
# This object is not used again in this chunk; note that it shares its name
# with the kwic() function.
kwic <- as.data.frame(kwic(corp_cons, pattern = "digital"))[, 4:6]

# Turn the keyword-in-context hits into a corpus and build a document-feature
# matrix without English stopwords, punctuation and numbers.
# NOTE(review): corpus() on a kwic object and dfm() with remove_* arguments
# appear to rely on an older quanteda API -- confirm against the installed
# version.
corp_kwic <- corpus(kwic(corp_cons, pattern = "digital"))
kwic_cons <- dfm(corp_kwic, remove = stopwords("english"), remove_punct = TRUE, remove_numbers = TRUE)

# 30 most frequent features as a one-column data frame (shares its name with
# base plot()).
# NOTE(review): the column header in the exported table is presumably derived
# from this expression's text -- confirm before refactoring this line.
plot <- as.data.frame(topfeatures(kwic_cons, 30))
# NOTE(review): `type` is normally an argument of print.xtable() rather than
# xtable() -- confirm it has any effect here.
print(xtable(plot, type = "latex", digits=1), file = "results_online_appendix/SI_kwic_red.tex")
```